# import library ---- 
library(readr)
# require(graphics)
# source("Assignment_1_my_kmeans.r")  # when done, source my_kmeans from Assignment_1_my_kmeans.r

# read data from file ----
# Load the player attributes table; column types are guessed by readr.
dat <- read_csv("Player_Attributes.csv", col_names = TRUE, col_types = NULL)  # from csv file
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   date = col_character(),
##   preferred_foot = col_character(),
##   attacking_work_rate = col_character(),
##   defensive_work_rate = col_character()
## )
## See spec(...) for full column specifications.

# Drop every row that contains at least one missing value.
Y <- na.omit(dat)

# Keep columns 5:6 and 10:42 (expected to be the numeric attribute columns;
# the positions are hard-coded, so verify them if the file layout changes)
# and turn the selection into a matrix for kmeans().
X <- as.matrix(Y[, c(5:6, 10:42)])

# Fail early with a clear message instead of a confusing kmeans() error if a
# non-numeric column slipped in or no complete rows are left.
stopifnot(is.numeric(X), nrow(X) > 0)

# Work on a random subset of at most 1000 rows. Capping the size at nrow(X)
# avoids the "cannot take a sample larger than the population" error when
# fewer than 1000 complete rows exist; drop = FALSE keeps X a matrix even if
# only one row is selected.
n_sample <- min(1000L, nrow(X))
X <- X[sample.int(nrow(X), n_sample), , drop = FALSE]
coln <- colnames(X)  # column names of data, used for axis labels

# evaluate for k argument ----
# Run kmeans() from R once for each k from 2 through 8, saving tot.withinss
# of every fit so the elbow graph can be drawn afterwards.
K <- 2:8  # candidate cluster counts (x-axis of the elbow graph)
TOTAL_WITHINSS <- numeric(length(K))  # preallocated; one cl$tot.withinss per k
for (idx in seq_along(K)) {
  k <- K[idx]
  cl <- kmeans(X, k, nstart = 25)  # replace kmeans with my_kmeans
  plot_title <- paste("Kmeans Clustering from R: nstart = 25, k = ", k)
  # all data points colored by cluster: column 31 against column 1
  plot(X[, 31], X[, 1], col = cl$cluster, main = plot_title,
       xlab = coln[31], ylab = coln[1])
  # points(cl$centers, col = 80, pch = 13, cex = 7, lwd = 2)  # the center of each cluster
  TOTAL_WITHINSS[idx] <- cl$tot.withinss
}

# create elbow graph ----
# Scatter the total within-cluster sum of squares against the number of
# clusters, then join the points so the "elbow" is easy to spot.
plot(
  x = K,
  y = TOTAL_WITHINSS,
  main = "Elbow Graph, nstart = 25"
)
lines(x = K, y = TOTAL_WITHINSS)

# determine k value ----
# Evaluate the elbow graph and examine the visualization of the clustering

# run kmeans() ---- for 2 and 3 clusters
coln <- colnames(X)

# Fit kmeans() on `X` with `k` centers, plot every column from the third
# onward against column 1 (points colored by assigned cluster), then print
# the between-cluster sum of squares, the cluster sizes and the per-cluster
# within sum of squares.
#
# Args:
#   X:      numeric matrix with column names; rows are observations.
#   k:      number of clusters passed to kmeans().
#   nstart: number of random starts passed to kmeans().
# Returns:
#   The fitted kmeans object.
report_clusters <- function(X, k, nstart = 25) {
  fit <- kmeans(X, k, nstart = nstart)  # replace kmeans with my_kmeans
  col_names <- colnames(X)

  # plot clusters and centroids
  # Columns 1 and 2 are skipped on the x-axis; deriving the range from
  # ncol(X) keeps the loop valid if the number of selected columns changes.
  for (j in seq_len(ncol(X))[-(1:2)]) {
    plot_title <- paste(paste0(k, " Clusters: "), col_names[j], "Against",
                        col_names[1])
    # all data points colored by cluster
    plot(X[, j], X[, 1], col = fit$cluster, main = plot_title,
         xlab = col_names[j], ylab = col_names[1])
    # points(fit$centers, col = 80, pch = 13, cex = 7, lwd = 2)  # the center of each cluster
  }

  print(paste("betweeness for", k, "clusters", fit$betweenss))
  # paste() is vectorized, so these print one string per cluster
  print(paste("cluster size =", fit$size))
  print(paste("cluster withinss =", fit$withinss))

  fit
}

# 2 clusters (the output below was recorded from an earlier run; the sampled
# rows are random, so the numbers differ between runs)
cl <- report_clusters(X, 2)
## [1] "betweeness for 2 clusters 3410761.68514216"
## [1] "cluster size = 85"  "cluster size = 915"
## [1] "cluster withinss = 257883.6"        
## [2] "cluster withinss = 5665105.62185784"

# 3 clusters (output recorded from an earlier run)
cl <- report_clusters(X, 3)
## [1] "betweeness for 3 clusters 5025379.10081075"
## [1] "cluster size = 85"  "cluster size = 421" "cluster size = 494"
## [1] "cluster withinss = 257883.6"        
## [2] "cluster withinss = 1786238.65558194"
## [3] "cluster withinss = 2264249.55060732"
# print all kmeans components ----
# print(cl)